source R profile. Memory was set to 500000.
Sys.setenv("R_ENVIRON_USER"='/Users/castilln/.Renviron')
Sys.getenv("R_ENVIRON_USER")
[1] "/Users/castilln/.Renviron"
CCLE_expression <- fread("/Users/castilln/Desktop/thesis/localdata/depmap/CCLE_expression.csv", header = TRUE)
#rows: tumor sample barcode
#column: genes
#values: expression (TPM) already log2 transformed
sample_info <- read_csv("/Users/castilln/Desktop/thesis/localdata/depmap/sample_info.csv") #metadata
── Column specification ──────────────────────────────────────────────────────────────────────────────────────
cols(
.default = col_character(),
COSMICID = col_double(),
Achilles_n_replicates = col_double(),
cell_line_NNMD = col_double(),
cas9_activity = col_double(),
WTSI_Master_Cell_ID = col_double(),
age = col_double(),
depmap_public_comments = col_logical()
)
ℹ Use `spec()` for the full column specifications.
11 parsing failures.
row col expected actual file
1105 depmap_public_comments 1/0/T/F/TRUE/FALSE Adherent version of CCLFPEDS0001T '/Users/castilln/Desktop/thesis/localdata/depmap/sample_info.csv'
1166 depmap_public_comments 1/0/T/F/TRUE/FALSE SV40+TERT immortalized kidney cell line '/Users/castilln/Desktop/thesis/localdata/depmap/sample_info.csv'
1455 depmap_public_comments 1/0/T/F/TRUE/FALSE Drug resistance: Dabrafenib and Trametinib '/Users/castilln/Desktop/thesis/localdata/depmap/sample_info.csv'
1456 depmap_public_comments 1/0/T/F/TRUE/FALSE Drug resistance: SCH772984 '/Users/castilln/Desktop/thesis/localdata/depmap/sample_info.csv'
1457 depmap_public_comments 1/0/T/F/TRUE/FALSE Drug resistance: Dabrafenib and Trametinib '/Users/castilln/Desktop/thesis/localdata/depmap/sample_info.csv'
.... ...................... .................. .......................................... .................................................................
See problems(...) for more details.
CCLE_mutations <- fread("/Users/castilln/Desktop/thesis/localdata/depmap/CCLE_info", header = TRUE) #mutations and sample info
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
mutations_spliceosome = fread("/Users/castilln/Desktop/thesis/localdata/depmap/mutations_spliceosome.csv")
Contingency table - Annotation of cell lines with spliceosome mutations
table(distinct(cell_lines_list_mutated)$primary_disease,distinct(cell_lines_list_mutated)$spliceosome_mutated)
NO YES
Adrenal Cancer 0 1
Bile Duct Cancer 3 33
Bladder Cancer 0 39
Bone Cancer 10 65
Brain Cancer 8 99
Breast Cancer 16 66
Cervical Cancer 0 22
Colon/Colorectal Cancer 4 79
Embryonal Cancer 0 3
Endometrial/Uterine Cancer 1 38
Engineered 9 5
Esophageal Cancer 0 38
Eye Cancer 0 9
Fibroblast 5 38
Gallbladder Cancer 0 7
Gastric Cancer 2 47
Head and Neck Cancer 4 72
Kidney Cancer 6 50
Leukemia 9 123
Liposarcoma 0 11
Liver Cancer 0 27
Lung Cancer 12 261
Lymphoma 4 105
Myeloma 1 33
Neuroblastoma 1 45
Non-Cancerous 2 3
Ovarian Cancer 5 69
Pancreatic Cancer 8 51
Prostate Cancer 0 13
Rhabdoid 1 19
Sarcoma 7 35
Skin Cancer 7 106
Teratoma 0 1
Thyroid Cancer 0 21
Unknown 9 36
#make sure expression data is normally distributed
#central limit theorem: sample size large enough to assume normal distribution
CCLE_expression =
CCLE_expression %>%
rename("DepMap_ID" = "V1")
#join expression data with metadata
long_CCLE_expression =
CCLE_expression %>%
pivot_longer(cols=-DepMap_ID, names_to = "Gene", values_to = "TPM") %>%
left_join(cell_lines_list_mutated, by = "DepMap_ID")
#Lets see the distribution of the expression data
library(ggdist)
ggplot(long_CCLE_expression, aes(x=TPM)) +
geom_histogram(aes(y=..density..), position="identity", alpha=0.5) +
geom_density(alpha=0.6)
Filter out those genes whose median for expression is lower than 1 across all samples
median_CCLE_expression =
long_CCLE_expression %>%
group_by(Gene) %>%
dplyr::mutate(median = median(TPM, na.rm=TRUE)) %>%
filter(median >= 1)
Plot again
ggplot(median_CCLE_expression, aes(x=TPM)) +
geom_histogram(aes(y=..density..), position="identity", alpha=0.5) +
geom_density(alpha=0.6)
After joinin with the expression data, let’s see how many cell lines we have mutated or not mutated
median_CCLE_expression =
median_CCLE_expression %>%
rename("cell_line" = "stripped_cell_line_name")
median_CCLE_expression %>%
ungroup() %>%
select(primary_disease, cell_line, spliceosome_mutated) %>%
distinct() %>%
group_by(primary_disease, spliceosome_mutated) %>%
tally()
median_CCLE_expression %>%
ungroup %>%
select(cell_line) %>%
distinct() %>% dim()
[1] 1369 1
Let’s keep only those cancers with a relatively significant number of cell lines w/o mutations in the spliceosome:
keep <- c("Bone Cancer", "Pancreatic Cancer", "Leukemia")
df_deseq =
median_CCLE_expression %>%
filter(primary_disease %in% keep)
#filt_CCLE_expression %>%
df_deseq %>%
select(DepMap_ID, Gene, TPM, spliceosome_mutated) %>%
filter(spliceosome_mutated == "NO") %>%
select(c(Gene, TPM))-> no_t_expression
#head(no_t_expression)
#filt_CCLE_expression %>%
df_deseq %>%
select(DepMap_ID, Gene, TPM, spliceosome_mutated) %>%
filter(spliceosome_mutated == "YES") %>%
select(c(Gene, TPM)) -> yes_t_expression
#head(yes_t_expression)
t.test(no_t_expression$TPM, yes_t_expression$TPM, alternative = "two.sided", var.equal = FALSE)
Welch Two Sample t-test
data: no_t_expression$TPM and yes_t_expression$TPM
t = 7.0946, df = 250114, p-value = 1.3e-12
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
0.02553765 0.04503382
sample estimates:
mean of x mean of y
4.100692 4.065406
A DGElist object conveniently stores count matrix, sample metadata and gene annotation as one object for easy manipulation.
df_counts = df_deseq %>%
select(cell_line, Gene, TPM) %>%
pivot_wider(names_from = cell_line, values_from = TPM) %>%
as.data.frame()
#STORE FIRST COLUMN AS ROWNAMES
rownames(df_counts) = df_counts[,1]
rownames(df_counts) <- gsub("\\s*\\([^\\)]+\\)","",as.character(rownames(df_counts)))
#REMOVE FIRST COLUMN
df_counts = df_counts[,-1]
##THERE ARE NA VALUES IN DF_COUNTS. SUBSTITUTE NA VALUES BY THE MEAN EXPRESSION FOR THAT CELL LINE
for(i in 1:ncol(df_counts)){
df_counts[is.na(df_counts[,i]), i] <- mean(df_counts[,i], na.rm = TRUE)
}
meta =
df_deseq %>%
ungroup() %>%
select(-c("TPM", "median", "Gene")) %>%
distinct()
Reorder count columns by meta ID
##REORDER COLUMNS BY SAMPLE INFO
df_counts = df_counts[meta$cell_line]
head(df_counts)
Create grouping factor
meta =
meta %>%
mutate(group = as.factor(spliceosome_mutated)) %>%
select(-spliceosome_mutated) %>%
mutate(primary_disease = as.factor(primary_disease))
Create gene annotation
#SOMETIMES PROBLEMS TO LOAD THIS LIBRARIES BECAUSE INCOMPATIBILITIES W/ COMPLEX HEATMAP: START NEW R SESSION
library("AnnotationDbi")
library("org.Hs.eg.db")
gene_annot = AnnotationDbi::mapIds(org.Hs.eg.db, keys = rownames(df_counts),
column = c("GENENAME"), keytype = ("SYMBOL")) %>%
as.data.frame()
'select()' returned 1:many mapping between keys and columns
colnames(gene_annot) = c("DESCRIPTION")
Change primary_disease names to something understandable for R as names
meta$primary_disease = gsub(" ", "_", meta$primary_disease)
meta$primary_disease = gsub("/", "_", meta$primary_disease)
dge = DGEList(counts = df_counts,
samples = meta,
genes = gene_annot)
summary(dge)
Length Class Mode
counts 2165430 -none- numeric
samples 6 data.frame list
genes 1 data.frame list
Filter low expressed genes
#keep = filterByExpr(dge, group = group)
#dge = dge[keep, , keep.lib.sizes = FALSE]
Calculate library normalization factors
dge <- calcNormFactors(dge)
dge$samples
plotMDS(dge)
Set up desing matrix
design = model.matrix(~0+group+primary_disease, data = dge$samples)
colnames(design)
[1] "groupNO" "groupYES" "primary_diseaseLeukemia"
[4] "primary_diseasePancreatic_Cancer"
#design
Calculate dispersions
dge = estimateDisp(dge, design)
fit <- glmQLFit(dge, design)
plotQLDisp(fit)
In order to compare the difference in cell lines with spliceosome mutations vs wt for all different cancers, we need to define some contrasts for the model.
my.contrasts <- makeContrasts(
YvsN = `groupYES`-`groupNO`,
levels = design
)
Now we are going to use the model contrasts to determine pairwise differential expression using the quasi-likelihood (QL)-F-test
qlf.spliceosome.YvsN <- glmQLFTest(fit, contrast=my.contrasts[,"YvsN"])
topTags(qlf.spliceosome.YvsN)
Coefficient: -1*groupNO 1*groupYES
Adjust p-values (FDR) and merge data into a single table to facilitate the analysis:
TidyQLF = function(qlf, contrast) {
df_qlf =
qlf$table %>%
as_tibble(rownames = "SYMBOL") %>%
mutate(FDR = p.adjust(PValue, method = "fdr"))
df_full =
cbind(qlf$genes,df_qlf) %>%
mutate(comparison = contrast) %>%
relocate(SYMBOL, .before = DESCRIPTION) %>%
relocate(comparison, .before = SYMBOL)
return(df_full)
}
tbl_YvsN = TidyQLF(qlf.spliceosome.YvsN,"YesvsNo")
head(tbl_YvsN)
Prepare for tSNE/PCA
##PIVOT WIDER
df_wide =
median_CCLE_expression %>%
select(DepMap_ID, Gene, TPM, spliceosome_mutated, primary_disease) %>%
pivot_wider(names_from = Gene, values_from = TPM) %>%
as.data.frame() %>%
relocate(DepMap_ID, .after = primary_disease)
expression_pca <- as.matrix(df_wide[,3:11400])
expression <- as.matrix(df_wide[,4:11400])
disease <- df_wide[, 2] #disease
mutated <- df_wide[, 1]
##GET RID OF NA VALUES AND SUBSTITUTE FOR MEAN EXPRESSION PER CELL LINE
for(i in 1:ncol(expression)){
expression[is.na(expression[,i]), i] <- mean(expression[,i], na.rm = TRUE)
}
Run PCA
pca1 = prcomp(expression, center = TRUE, scale = TRUE, na.action=na.omit(expression))
In prcomp.default(expression, center = TRUE, scale = TRUE, na.action = na.omit(expression)) :
extra argument ‘na.action’ will be disregarded
plotData = pca1$x[,1:2]
plotData = cbind(DepMap_ID = expression_pca[,1], plotData)
rownames(plotData) = NULL
head(plotData)
DepMap_ID PC1 PC2
[1,] "ACH-001113" "-25.8912390948755" "-11.8772754159592"
[2,] "ACH-001289" "-32.089833776787" "-2.23149492296967"
[3,] "ACH-001339" "15.0538289388188" "-24.0719270653194"
[4,] "ACH-001538" "25.7704229940894" "-16.4639435369173"
[5,] "ACH-000242" "-5.39230565201994" "1.88725601590978"
[6,] "ACH-000708" "10.3601145160347" "9.1981704143161"
ID = plotData[,1]
to_join =
median_CCLE_expression %>%
select(DepMap_ID, primary_disease, spliceosome_mutated)
Adding missing grouping variables: `Gene`
plotData %>%
as_tibble() %>%
left_join(to_join, by ="DepMap_ID") %>%
mutate(PC1 = as.double(PC1),
PC2 = as.double(PC2)) -> plotData_gg
Plot
ggplot(plotData_gg, aes(x = PC1,y = PC2, color = spliceosome_mutated)) +
geom_point() +
facet_wrap(vars(primary_disease), scales = "free") +
stat_ellipse() -> gg_expression_mutated
ggsave("/Users/castilln/Desktop/thesis/gg_expression_mutated.png", device = "png", width = 35, height = 20, units = "cm", dpi = "retina")
ggplot(plotData_gg, aes(x = PC1,y = PC2, color = primary_disease)) +
geom_point() +
scale_color_manual(values = c("gainsboro", 'forestgreen', 'red2', 'orange', 'cornflowerblue',
'magenta', 'darkolivegreen4', 'indianred1', 'tan4', 'darkblue',
'mediumorchid1', 'firebrick4', 'yellowgreen', 'lightsalmon', 'tan3',
"tan1", 'darkgray','wheat4', '#DDAD4B', 'chartreuse',
'seagreen1', 'moccasin', 'mediumvioletred', 'seagreen', 'cadetblue1',
"darkolivegreen1" ,"tan2" , "tomato3" , "#7CE3D8", "black", "yellow", "violetred", "blue")) -> gg_expression_disease
ggsave("/Users/castilln/Desktop/thesis/gg_expression_disease.png", device = "png", width = 25, height = 15, units = "cm", dpi = "retina")
grid.arrange(gg_expression_disease, gg_expression_mutated, ncol = 2)
NA
Run tSNE